Data preparation

Сonvert data to csv format to read it easily with R and remove unuseful columns from it

data <- read.csv("../data/IAD30284_384WellPlateDataSheet.csv", header = TRUE, skip = 14)
data <- data[data$Amplicon_ID != "Blank", ]
data$X384Well_Row <- NULL
data$X384Well_Col <- NULL
data$Gene_Symbol <- NULL
data$Genome_Version <- NULL
data$LineItem_Type <- NULL

Get amplicon sequence that lies between primers

library(jsonlite)
## 
## Attaching package: 'jsonlite'
## 
## The following object is masked from 'package:utils':
## 
##     View
library(httr)
n <- nrow(data)
chromosomes <- data$Chr
beginings <- data$Insert_Start
endings <- data$Insert_Stop
ampliconInnerPart <- rep("", n)
for (i in 1:n){
  call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
                         "&start=",as.character(beginings[i]),"&end=",as.character(endings[i])), collapse = "")
  call
  jsonData <- fromJSON(call)
  ampliconInnerPart[i] <- jsonData$sequence
}
head(ampliconInnerPart)
## [1] "TGTTTTCTTTGATCTTACAGTTGTTATTAATTGTGATTGGAGCTATAGCAGTTGTCGCAGTTTTACAACCCTACATCTTTGTTGCAACAGTGCCAGTGATAGTGGCTTTTATTATGTTGAGAGCATATTTCCTCCAAACCTCACAGCAACTCAAACAACTGGAATCTGAAG" 
## [2] "AATGCATTAATGCTATTCTGATTCTATAATATGTTTTTGCTCTCTTTTATAAATAGGATTTCTTACAAAAGCAAGAATATAAGACATTGGAATATAACTTAACGACTACAGAAGTAGTGATGGA"                                                
## [3] "ATGGGGCTAATCTGGGAGTTGTTACAGGCGTCTGCCTTCTGTGGACTTGGTTTCCTGATAGTCCTTGCCCTTTTTCAGGCTGGGCTAGGGAGAATGATGATGAAGTACAGGTAGCAACCTATTTTCATAACTTGAAAGTTTTAAAAATTATGTTTTCAAAAAGCCCAC"    
## [4] "ACATAAAACAAGCATCTATTGAAAATATCTGACAAACTCATCTTTTATTTTTGATGTGTGTGTGTGTGTGTGTGTGTTTTTTTAACAGGGATTTGGGGAATTATTTGAGAAAGCAAAACAAAACAATAACAATAGAAAAACTTCTAATGGTGATGACAGCCTCTTCTTCAGT"
## [5] "CAGGCAAGGTAGTTCTTTTGTTCTTCACTATTAAGAACTTAATTTGGTGTCCATGTCTCTTTTTTTTTCTAGTTTGTAGTGCTGGAAGGTATTTTTGGAGAAATTCTTAC"                                                              
## [6] "AGTTCATTGACATGCCAACAGAAGGTAAACCTACCAAGTCAACCAAACCATACAAGAATGGCCAACTCTCGAAAGTTATGATTA"
data$Amplicon_Inner_Part <- ampliconInnerPart

Get dangling ends

danglingEndStart <- rep("", n)
danglingEndStop <- rep("", n)
for (i in 1:n){
  call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
                         "&start=",as.character(beginings[i]-1),"&end=",as.character(beginings[i]-1)), collapse = "")
  jsonData <- fromJSON(call)
  danglingEndStart[i] <- jsonData$sequence
  call <- paste(c("http://178.79.134.178/refservice/references/GRCh37.p13/sequence?contig=", as.character(chromosomes[i]),
                        "&start=",as.character(endings[i]+1),"&end=",as.character(endings[i]+1)), collapse = "")
  jsonData <- fromJSON(call)
  danglingEndStop[i] <- jsonData$sequence
}
data$Start_Dangling_End <- danglingEndStart
data$Stop_Dangling_End <- danglingEndStop

Load results and get a proportion for each

readResultFile <- function(name, addName = FALSE, delim = "\t"){
  filePath <- paste(c("../data/", name), collapse = "")
  result <- read.table(filePath, sep = delim, header = TRUE)
  result$Gene <- NULL
  result$sam <- NULL
  result$X <- NULL
  sums <- colSums(result[,-1])
  proportion_result <- sweep(result[,-1], 2, sums, '/' )
  #proportion_result <- sapply(proportion_result, function(x) log(x))
  proportion_result$Means <- rowMeans(proportion_result)
  proportion_result$Target <- result$Target
  y <- data.frame(Target = character(127))
  y$Target <- result$Target
  ylog <- data.frame(Target = character(127))
  ylog$Target <- result$Target
  if (addName){
    logresultStr <- paste(c(name,"_logresult"), collapse = "")
    logMeansStr <- paste(c(name,"_logMeans"), collapse = "")
    resultStr <- paste(c(name,"_result"), collapse = "")
    MeansStr <- paste(c(name,"_Means"), collapse = "")
  }else{
    logresultStr <- "logresult"
    logMeansStr <- "logMeans"
    resultStr <- "result"
    MeansStr <- "Means"
  }
  #ylog[resultStr] <- log(proportion_result[,1])
  ylog[MeansStr] <- log(proportion_result$Means)
  #y[resultStr] <- proportion_result[,1]
  y[MeansStr] <- proportion_result$Means
  ylog[ylog[MeansStr] == -Inf,][MeansStr] <- -10 # min(y[ylog$Means != -Inf,]$logMeans) - 20
  #ylog[ylog[resultStr] == -Inf,][resultStr] <- -10 # min(y[ylog$result != -Inf,]$logresult) - 20
  rownames(y) <- y$Target
  rownames(ylog) <- ylog$Target
  proportion <- proportion_result
  proportion$Means <- NULL
  proportion$Target <- NULL
  rownames(proportion) <- proportion$Target
  return(list(y = y, ylog = ylog, proportions = proportion))
}

 y <- readResultFile("result.csv",FALSE, ",")$y
 ylog <- readResultFile("result.csv",FALSE, ",")$ylog
#y <- readResultFile("SN1-21_Run_17.xls",FALSE)$y
#ylog <- readResultFile("SN1-21_Run_17.xls",FALSE)$ylog

resultFileNames <- c("SN1-17_Run_14.xls", "SN1-21_Run_17.xls","SN1-23-Run_18.xls","SN1-25-Run_19.xls", "SN1-26-Run_20.xls")
totalTable <- data.frame(Target = character(127))
totalTableOfMeans <- data.frame(Target = character(127))
totalTableOflogMeans <- data.frame(Target = character(127))
for (i in 1:length(resultFileNames)){
  res <- readResultFile(resultFileNames[i], TRUE)
  namesOfColumns <- names(res$proportions)
  totalTable[namesOfColumns] <- res$proportions[namesOfColumns]
  namesOfColumns <- names(res$y)
  totalTableOfMeans[namesOfColumns] <- res$y
  totalTableOflogMeans[namesOfColumns] <- res$ylog
}
totalTable$Target <- y$Target
totalTable$sd <- apply(totalTable[, -1], 1, sd)
totalTable$mean <- apply(totalTable[, -1], 1, mean)
totalTableOfMeans$sd <- apply(totalTableOfMeans[, -1], 1, sd)
totalTableOflogMeans$sd <- apply(totalTableOflogMeans[, -1], 1, sd)
rownames(totalTable) <- totalTable$Target
rownames(totalTableOfMeans) <- totalTableOfMeans$Target
rownames(totalTableOflogMeans) <- totalTableOflogMeans$Target

Prepare features

prepared_data <- data.frame(Target = character(172))
prepared_data$Target <- data$Amplicon_ID

prepared_data$Start_Dangling_End <- data$Start_Dangling_End
prepared_data$Stop_Dangling_End <- data$Stop_Dangling_End

prepared_data$Fwd_Primer_Len <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Fwd_Primer.), function(x) nchar(x)))
prepared_data$Rev_Primer_Len <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Rev_Primer.), function(x) nchar(x)))
prepared_data$Amplicon_Len <- as.numeric(lapply(as.character(data$Amplicon_Inner_Part), function(x) nchar(x)))

prepared_data$END_3 <- as.character(lapply(data$Amplicon_Inner_Part, function(x) substr(x, nchar(x), nchar(x))))
countCharOccurrences <- function(char, s) {s2 <- gsub(char,"",s);return (nchar(s) - nchar(s2))}
countGCcontent <- function(x){ gg <- countCharOccurrences('G',x); cc <-countCharOccurrences('C',x); aa <- countCharOccurrences('A',x); tt <- countCharOccurrences('T',x); (cc+gg)*100/(aa+tt+cc+gg) }
prepared_data$Fwd_Primer_GC_content <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Fwd_Primer.), function(x)  countGCcontent(x)))
prepared_data$Rev_Primer_GC_content <- as.numeric(lapply(as.character(data$Ion_AmpliSeq_Rev_Primer.), function(x)  countGCcontent(x)))
prepared_data$Amplicon_GC_content <- as.numeric(lapply(as.character(data$Amplicon_Inner_Part), function(x) countGCcontent(x)))
rownames(prepared_data) <- prepared_data$Target
valid_names <- intersect(y$Target, data$Amplicon_ID)
prepared_data <- prepared_data[valid_names,]
prepared_data$Start_Dangling_End <- as.factor(prepared_data$Start_Dangling_End)
prepared_data$Stop_Dangling_End <- as.factor(prepared_data$Stop_Dangling_End)
prepared_data$END_3 <- as.factor(prepared_data$END_3)
y <- y[valid_names,]
ylog <- ylog[valid_names,]
totalTable <- totalTable[valid_names,]
totalTableOfMeans <- totalTableOfMeans[valid_names,]
totalTableOflogMeans <- totalTableOflogMeans[valid_names,]

Graphics of dependencies of result on different features

library(lattice)

xyplot(ylog$Means ~ prepared_data$Amplicon_GC_content,  xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Amplicon GC-content")

xyplot(ylog$Means ~ prepared_data$Amplicon_Len,  xlab="Length", ylab="log of amplicon percental amount", main="Amplicon length")

xyplot(ylog$Means ~ prepared_data$Fwd_Primer_GC_content,  xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Forward Primer GC-content")

xyplot(ylog$Means ~ prepared_data$Fwd_Primer_Len,  xlab="Length", ylab="log of amplicon percental amount", main="Forward Primer length")

xyplot(ylog$Means ~ prepared_data$Rev_Primer_GC_content,  xlab="GC-content(%)", ylab="log of amplicon percental amount", main="Reverse Primer GC-content")

xyplot(ylog$Means ~ prepared_data$Rev_Primer_Len,  xlab="Length", ylab="log of amplicon percental amount", main="Reverse Primer length")

xyplot(y$Means ~ prepared_data$Amplicon_GC_content,  xlab="GC-content(%)", ylab="amplicon percental amount", main="Amplicon GC-content")

xyplot(y$Means ~ prepared_data$Amplicon_Len,  xlab="Length", ylab="amplicon percental amount", main="Amplicon length")

xyplot(y$Means ~ prepared_data$Fwd_Primer_GC_content,  xlab="GC-content(%)", ylab="amplicon percental amount", main="Forward Primer GC-content")

xyplot(y$Means ~ prepared_data$Fwd_Primer_Len,  xlab="Length", ylab="amplicon percental amount", main="Forward Primer length")

xyplot(y$Means ~ prepared_data$Rev_Primer_GC_content,  xlab="GC-content(%)", ylab="amplicon percental amount", main="Reverse Primer GC-content")

xyplot(y$Means ~ prepared_data$Rev_Primer_Len,  xlab="Length", ylab="amplicon percental amount", main="Reverse Primer length")

xyplot(totalTable$mean ~ prepared_data$Amplicon_GC_content,  xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Amplicon GC-content")

xyplot(totalTable$mean ~ prepared_data$Amplicon_Len,  xlab="Length", ylab="mean of amplicon percental amount", main="Amplicon length")

xyplot(totalTable$mean ~ prepared_data$Fwd_Primer_GC_content,  xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Forward Primer GC-content")

xyplot(totalTable$mean ~ prepared_data$Fwd_Primer_Len,  xlab="Length", ylab="mean of amplicon percental amount", main="Forward Primer length")

xyplot(totalTable$mean ~ prepared_data$Rev_Primer_GC_content,  xlab="GC-content(%)", ylab="mean of  amplicon percental amount", main="Reverse Primer GC-content")

xyplot(totalTable$mean ~ prepared_data$Rev_Primer_Len,  xlab="Length", ylab="mean of amplicon percental amount", main="Reverse Primer length")

xyplot(totalTable$sd ~ prepared_data$Amplicon_GC_content,  xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Amplicon GC-content")

xyplot(totalTable$sd ~ prepared_data$Amplicon_Len,  xlab="Length", ylab="sd of amplicon percental amount", main="Amplicon length")

xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_GC_content,  xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Forward Primer GC-content")

xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_Len,  xlab="Length", ylab="sd of amplicon percental amount", main="Forward Primer length")

xyplot(totalTable$sd ~ prepared_data$Rev_Primer_GC_content,  xlab="GC-content(%)", ylab="sd of  amplicon percental amount", main="Reverse Primer GC-content")

xyplot(totalTable$sd ~ prepared_data$Rev_Primer_Len,  xlab="Length", ylab="sd of amplicon percental amount", main="Reverse Primer length")

xyplot(log(totalTable$mean) ~ prepared_data$Amplicon_GC_content,  xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Amplicon GC-content")

xyplot(log(totalTable$mean) ~ prepared_data$Amplicon_Len,  xlab="Length", ylab="mean of amplicon percental amount", main="Amplicon length")

xyplot(log(totalTable$mean) ~ prepared_data$Fwd_Primer_GC_content,  xlab="GC-content(%)", ylab="mean of amplicon percental amount", main="Forward Primer GC-content")

xyplot(log(totalTable$mean) ~ prepared_data$Fwd_Primer_Len,  xlab="Length", ylab="mean of amplicon percental amount", main="Forward Primer length")

xyplot(log(totalTable$mean) ~ prepared_data$Rev_Primer_GC_content,  xlab="GC-content(%)", ylab="mean of  amplicon percental amount", main="Reverse Primer GC-content")

xyplot(log(totalTable$mean) ~ prepared_data$Rev_Primer_Len,  xlab="Length", ylab="mean of amplicon percental amount", main="Reverse Primer length")

xyplot(totalTable$sd ~ prepared_data$Amplicon_GC_content,  xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Amplicon GC-content")

xyplot(totalTable$sd ~ prepared_data$Amplicon_Len,  xlab="Length", ylab="sd of amplicon percental amount", main="Amplicon length")

xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_GC_content,  xlab="GC-content(%)", ylab="sd of amplicon percental amount", main="Forward Primer GC-content")

xyplot(totalTable$sd ~ prepared_data$Fwd_Primer_Len,  xlab="Length", ylab="sd of amplicon percental amount", main="Forward Primer length")

xyplot(totalTable$sd ~ prepared_data$Rev_Primer_GC_content,  xlab="GC-content(%)", ylab="sd of  amplicon percental amount", main="Reverse Primer GC-content")

xyplot(totalTable$sd ~ prepared_data$Rev_Primer_Len,  xlab="Length", ylab="sd of amplicon percental amount", main="Reverse Primer length")

 histogram(~ylog$Means|factor(prepared_data$Start_Dangling_End))

Graphics of dependencies of standard variation of result

bwplot(prepared_data$END_3 ~ ylog$Means,     ylab="nucleotids", xlab="log of amplicon percental amount", 
       main="3' end")

bwplot(prepared_data$Start_Dangling_End ~ ylog$Means,     ylab="nucleotids", xlab="log of amplicon percental amount", 
       main="First dangling end")

bwplot(prepared_data$Stop_Dangling_End ~ ylog$Means,     ylab="nucleotids", xlab="log of amplicon percental amount", 
       main="Second dangling end")

bwplot(prepared_data$END_3 ~ y$Means,     ylab="nucleotids", xlab="amplicon percental amount", 
        main="3' end")

bwplot(prepared_data$Start_Dangling_End ~ y$Means,     ylab="nucleotids", xlab="amplicon percental amount", 
      main="First dangling end")

bwplot(prepared_data$Stop_Dangling_End ~ y$Means,     ylab="nucleotids", xlab="amplicon percental amount", 
       main="Second dangling end")

bwplot(prepared_data$END_3 ~ totalTable$sd,     ylab="nucleotids", xlab="standard deviation of amplicon percental amount", 
        main="3' end")

bwplot(prepared_data$Start_Dangling_End ~ totalTable$sd,     ylab="nucleotids", xlab="standard deviation of amplicon percental amount", 
      main="First dangling end")

bwplot(prepared_data$Stop_Dangling_End ~ totalTable$sd,     ylab="nucleotids", xlab="standard deviation of amplicon percental amount", 
       main="Second dangling end")

bwplot(prepared_data$END_3 ~ totalTable$mean,     ylab="nucleotids", xlab="mean of amplicon percental amount", 
        main="3' end")

bwplot(prepared_data$Start_Dangling_End ~ totalTable$mean,     ylab="nucleotids", xlab="mean of amplicon percental amount", 
      main="First dangling end")

bwplot(prepared_data$Stop_Dangling_End ~ totalTable$mean,     ylab="nucleotids", xlab="mean of amplicon percental amount", 
       main="Second dangling end")

amplicon_len <- prepared_data$Amplicon_Len
q <- quantile(amplicon_len, probs = c(1/4, 1/2, 3/4, 1))
amplicon_len[amplicon_len <= q[1]] <- q[1]
amplicon_len[amplicon_len > q[1] & amplicon_len <= q[2]] <- q[2]
amplicon_len[amplicon_len > q[2] & amplicon_len <= q[3]] <- q[3]
amplicon_len[amplicon_len > q[3]] <- q[4]
histogram(amplicon_len)

bwplot(amplicon_len ~ y$Means,     ylab="length", xlab="amplicon percental amount", 
       main="Amplicon length")

bwplot(amplicon_len ~ ylog$Means,     ylab="length", xlab="log of amplicon percental amount", 
       main="Amplicon length")

bwplot(amplicon_len ~ totalTable$sd,     ylab="length",  xlab="standard deviation of amplicon percental amount", 
       main="Amplicon length")

bwplot(amplicon_len ~ totalTable$mean,     ylab="length",  xlab="mean of amplicon percental amount", 
       main="Amplicon length")

amplicon_gc <- prepared_data$Amplicon_GC_content
q <- quantile(amplicon_gc, probs = c(1/4, 1/2, 3/4, 1))
amplicon_gc[amplicon_gc <= q[1]] <- q[1]
amplicon_gc[amplicon_gc > q[1] & amplicon_gc <= q[2]] <- q[2]
amplicon_gc[amplicon_gc > q[2] & amplicon_gc <= q[3]] <- q[3]
amplicon_gc[amplicon_gc > q[3]] <- q[4]
histogram(amplicon_gc)

bwplot(amplicon_gc ~ y$Means,     ylab="GC-content", xlab="amplicon percental amount", 
       main="Amplicon GC-content")

bwplot(amplicon_gc ~ ylog$Means,   ylab="GC-content", xlab="log of amplicon percental amount", 
       main="Amplicon GC-content")

bwplot(amplicon_gc ~ totalTable$sd,  ylab="GC-content",  xlab="standard deviation of amplicon percental amount", 
       main="Amplicon GC-content")

bwplot(amplicon_gc ~ totalTable$mean,  ylab="GC-content",  xlab="mean of amplicon percental amount", 
       main="Amplicon GC-content")

primer_len <- prepared_data$Fwd_Primer_Len
q <- quantile(primer_len, probs = c(1/4, 1/2, 3/4, 1))
primer_len[primer_len <= q[1]] <- q[1]
primer_len[primer_len > q[1] & primer_len <= q[2]] <- q[2]
primer_len[primer_len > q[2] & primer_len <= q[3]] <- q[3]
primer_len[primer_len > q[3]] <- q[4]
histogram(primer_len)

bwplot(primer_len ~ y$Means,     ylab="length", xlab="primer percental amount", 
       main="Forward Primer length")

bwplot(primer_len ~ ylog$Means,   ylab="length", xlab="log of amplicon percental amount", 
       main="Forward Primer length")

bwplot(primer_len ~ totalTable$sd,  ylab="length",  xlab="standard deviation of amplicon percental amount", 
       main="Forward Primer length")

bwplot(primer_len ~ totalTable$mean,  ylab="length",  xlab="mean of amplicon percental amount", 
       main="Forward Primer length")

primer_gc <- prepared_data$Fwd_Primer_GC_content
q <- quantile(primer_gc, probs = c(1/4, 1/2, 3/4, 1))
primer_gc[primer_gc <= q[1]] <- q[1]
primer_gc[primer_gc > q[1] & primer_gc <= q[2]] <- q[2]
primer_gc[primer_gc > q[2] & primer_gc <= q[3]] <- q[3]
primer_gc[primer_gc > q[3]] <- q[4]
bwplot(primer_gc ~ y$Means,     ylab="GC-content", xlab="primer percental amount", 
       main="Forward Primer GC-content")

bwplot(primer_gc ~ ylog$Means,   ylab="GC-content", xlab="log of amplicon percental amount", 
       main="Forward Primer GC-content")

bwplot(primer_gc ~ totalTable$sd,  ylab="GC-content",  xlab="standard deviation of amplicon percental amount", 
       main="Forward Primer GC-content")

bwplot(primer_gc ~ totalTable$mean,  ylab="GC-content",  xlab="mean of amplicon percental amount", 
       main="Forward Primer GC-content")

primer_len <- prepared_data$Rev_Primer_Len
q <- quantile(primer_len, probs = c(1/4, 1/2, 3/4, 1))
primer_len[primer_len <= q[1]] <- q[1]
primer_len[primer_len > q[1] & primer_len <= q[2]] <- q[2]
primer_len[primer_len > q[2] & primer_len <= q[3]] <- q[3]
primer_len[primer_len > q[3]] <- q[4]
histogram(primer_len)

bwplot(primer_len ~ y$Means,     ylab="length", xlab="primer percental amount", 
       main="Reverse Primer length")

bwplot(primer_len ~ ylog$Means,   ylab="length", xlab="log of amplicon percental amount", 
       main="Reverse Primer length")

bwplot(primer_len ~ totalTable$sd,  ylab="length",  xlab="standard deviation of amplicon percental amount", 
       main="Reverse Primer length")

bwplot(primer_len ~ totalTable$mean,  ylab="length",  xlab="mean of amplicon percental amount", 
       main="Reverse Primer length")

primer_gc <- prepared_data$Rev_Primer_GC_content
q <- quantile(primer_gc, probs = c(1/4, 1/2, 3/4, 1))
primer_gc[primer_gc <= q[1]] <- q[1]
primer_gc[primer_gc > q[1] & primer_gc <= q[2]] <- q[2]
primer_gc[primer_gc > q[2] & primer_gc <= q[3]] <- q[3]
primer_gc[primer_gc > q[3]] <- q[4]
bwplot(primer_gc ~ y$Means,     ylab="GC-content", xlab="primer percental amount", 
       main="Reverse Primer GC-content")

bwplot(primer_gc ~ ylog$Means,   ylab="GC-content", xlab="log of amplicon percental amount", 
       main="Reverse Primer GC-content")

bwplot(primer_gc ~ totalTable$sd,  ylab="GC-content",  xlab="standard deviation of amplicon percental amount", 
       main="Reverse Primer GC-content")

bwplot(primer_gc ~ totalTable$mean,  ylab="GC-content",  xlab="mean of amplicon percental amount", 
       main="Reverse Primer GC-content")

prepared_data$res <- y$Means
prepared_data$logres <- ylog$Means
prepared_data$sd <- totalTable$sd
prepared_data$mean <- totalTable$mean
prepared_data$MeansSD <- totalTableOfMeans$sd
write.table(prepared_data, "../data/features.tsv", col.names = TRUE, sep = "\t", row.names = FALSE)

Use all features to train model

library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
splitdf <- function(dataframe, n = 4, seed=NULL) {
    if (!is.null(seed)) set.seed(seed)
    index <- 1:nrow(dataframe)
    trainindex <- sample(index, trunc(length(index)/n))
    trainset <- dataframe[trainindex, ]
    testset <- dataframe[-trainindex, ]
    list(trainset=trainset,testset=testset)
}
splited <- splitdf(prepared_data)
trainSet <- splited$trainset
testSet <- splited$testset
formula <- res ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
##                       IncNodePurity
## Amplicon_Len           1.297057e-04
## Amplicon_GC_content    1.039724e-04
## END_3                  4.852462e-05
## Start_Dangling_End     6.530856e-05
## Stop_Dangling_End      4.758021e-05
## Fwd_Primer_Len         3.321257e-05
## Fwd_Primer_GC_content  4.192514e-05
## Rev_Primer_Len         6.474327e-05
## Rev_Primer_GC_content  5.377356e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse <- function(error)
{
    sqrt(mean(error^2))
}
sd(testSet$res)
## [1] 0.004352934
rmse(predicted - testSet$res)
## [1] 0.00601293
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.004024284

Use only amplicon information and 3’ end to train model

formula <- mean ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
##                       IncNodePurity
## Amplicon_Len           9.513483e-05
## Amplicon_GC_content    8.769304e-05
## END_3                  4.572461e-05
## Start_Dangling_End     2.824424e-05
## Stop_Dangling_End      2.904317e-05
## Fwd_Primer_Len         2.524799e-05
## Fwd_Primer_GC_content  2.820977e-05
## Rev_Primer_Len         6.203297e-05
## Rev_Primer_GC_content  5.307879e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$mean)
## [1] 0.005290301
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$mean)
## [1] 0.003831432

Use only amplicon information and 3’ end to train model

formula <- sd ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
##                       IncNodePurity
## Amplicon_Len           2.343244e-05
## Amplicon_GC_content    8.228964e-06
## END_3                  1.205106e-05
## Start_Dangling_End     3.004172e-06
## Stop_Dangling_End      2.068637e-06
## Fwd_Primer_Len         7.137126e-06
## Fwd_Primer_GC_content  6.770621e-06
## Rev_Primer_Len         4.540529e-06
## Rev_Primer_GC_content  5.659024e-06
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$sd)
## [1] 0.002147819
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$sd)
## [1] 0.00193279

Use only amplicon information and 3’ end to train model

formula <- res ~ Amplicon_Len + Amplicon_GC_content + END_3
rf <- randomForest(formula, trainSet)
importance(rf)
##                     IncNodePurity
## Amplicon_Len         0.0002155846
## Amplicon_GC_content  0.0001887638
## END_3                0.0001117014
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003903581
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003750015

Use only amplicon information to train model

formula <- res ~ Amplicon_Len + Amplicon_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
##                     IncNodePurity
## Amplicon_Len         0.0002841399
## Amplicon_GC_content  0.0002689326
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
mean(testSet$res)
## [1] 0.007852893
rmse(predicted - testSet$res)
## [1] 0.003824522
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.003651368

Use other to train model

formula <- res ~ END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+Rev_Primer_Len +Rev_Primer_GC_content
rf <- randomForest(formula, trainSet)
importance(rf)
##                       IncNodePurity
## END_3                  6.899303e-05
## Start_Dangling_End     8.206081e-05
## Stop_Dangling_End      7.034166e-05
## Fwd_Primer_Len         5.873686e-05
## Fwd_Primer_GC_content  7.620549e-05
## Rev_Primer_Len         1.059429e-04
## Rev_Primer_GC_content  9.323628e-05
fit <- lm(formula, data=trainSet)
#plot(fit)
predicted <- predict(fit, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.006482053
predicted <- predict(rf, newdata = testSet)
rmse(predicted - testSet$res)
## [1] 0.004897928

Different feature selection tests

library(FSelector)
formula <- res ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                0.4927980
## Amplicon_GC_content         0.4013567
## END_3                       0.1636325
## Start_Dangling_End          0.2292457
## Stop_Dangling_End           0.1955937
## Fwd_Primer_Len              0.0000000
## Fwd_Primer_GC_content       0.0000000
## Rev_Primer_Len              0.0000000
## Rev_Primer_GC_content       0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)

information.gain(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.20481515
## Amplicon_GC_content        0.20752242
## END_3                      0.05912807
## Start_Dangling_End         0.11438548
## Stop_Dangling_End          0.09951913
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
gain.ratio(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.20575099
## Amplicon_GC_content        0.35674389
## END_3                      0.03013050
## Start_Dangling_End         0.06257012
## Stop_Dangling_End          0.05606093
## Fwd_Primer_Len                    NaN
## Fwd_Primer_GC_content             NaN
## Rev_Primer_Len                    NaN
## Rev_Primer_GC_content             NaN
symmetrical.uncertainty(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.12348679
## Amplicon_GC_content        0.14294834
## END_3                      0.02760319
## Start_Dangling_End         0.05512733
## Stop_Dangling_End          0.04858213
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
oneR(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                2.3739993
## Amplicon_GC_content         2.6550875
## END_3                       0.7936508
## Start_Dangling_End          0.9641420
## Stop_Dangling_End           0.9178561
## Fwd_Primer_Len              2.9170625
## Fwd_Primer_GC_content       2.9170625
## Rev_Primer_Len              2.9170625
## Rev_Primer_GC_content       2.9170625
random.forest.importance(formula, data = prepared_data, importance.type = 1)
##                       attr_importance
## Amplicon_Len               31.7515991
## Amplicon_GC_content        29.6047708
## END_3                      -2.6640769
## Start_Dangling_End          0.1273096
## Stop_Dangling_End          -3.1139932
## Fwd_Primer_Len              6.6302716
## Fwd_Primer_GC_content       6.1315344
## Rev_Primer_Len              3.0663363
## Rev_Primer_GC_content       0.1135314

Different feature selection tests

library(FSelector)
formula <- MeansSD ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                0.0000000
## Amplicon_GC_content         0.5266193
## END_3                       0.1729355
## Start_Dangling_End          0.1274934
## Stop_Dangling_End           0.1298428
## Fwd_Primer_Len              0.0000000
## Fwd_Primer_GC_content       0.0000000
## Rev_Primer_Len              0.0000000
## Rev_Primer_GC_content       0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)

information.gain(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.00000000
## Amplicon_GC_content        0.18033461
## END_3                      0.06875444
## Start_Dangling_End         0.03639421
## Stop_Dangling_End          0.03419287
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
gain.ratio(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                      NaN
## Amplicon_GC_content        0.39746114
## END_3                      0.03503591
## Start_Dangling_End         0.01990803
## Stop_Dangling_End          0.01926146
## Fwd_Primer_Len                    NaN
## Fwd_Primer_GC_content             NaN
## Rev_Primer_Len                    NaN
## Rev_Primer_GC_content             NaN
symmetrical.uncertainty(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.00000000
## Amplicon_GC_content        0.12994916
## END_3                      0.03209714
## Start_Dangling_End         0.01753995
## Stop_Dangling_End          0.01669189
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
oneR(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                2.9535005
## Amplicon_GC_content         2.7997829
## END_3                       0.8917744
## Start_Dangling_End          0.7936508
## Stop_Dangling_End           0.9320720
## Fwd_Primer_Len              2.9535005
## Fwd_Primer_GC_content       2.9535005
## Rev_Primer_Len              2.9535005
## Rev_Primer_GC_content       2.9535005
random.forest.importance(formula, data = prepared_data, importance.type = 1)
##                       attr_importance
## Amplicon_Len               5.41421199
## Amplicon_GC_content       10.41942249
## END_3                     -4.80328263
## Start_Dangling_End        -1.41766512
## Stop_Dangling_End         -2.78939925
## Fwd_Primer_Len             4.09201432
## Fwd_Primer_GC_content      6.75474582
## Rev_Primer_Len             3.57277811
## Rev_Primer_GC_content     -0.08268964

Different feature selection tests

library(FSelector)
histogram(prepared_data$sd)

sd(prepared_data$sd)
## [1] 0.001848055
formula <- sd ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                0.0000000
## Amplicon_GC_content         0.0000000
## END_3                       0.1441501
## Start_Dangling_End          0.1902519
## Stop_Dangling_End           0.1559919
## Fwd_Primer_Len              0.0000000
## Fwd_Primer_GC_content       0.0000000
## Rev_Primer_Len              0.0000000
## Rev_Primer_GC_content       0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)

information.gain(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.00000000
## Amplicon_GC_content        0.00000000
## END_3                      0.04685588
## Start_Dangling_End         0.09190803
## Stop_Dangling_End          0.05322211
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
gain.ratio(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                      NaN
## Amplicon_GC_content               NaN
## END_3                      0.02387683
## Start_Dangling_End         0.05027470
## Stop_Dangling_End          0.02998098
## Fwd_Primer_Len                    NaN
## Fwd_Primer_GC_content             NaN
## Rev_Primer_Len                    NaN
## Rev_Primer_GC_content             NaN
symmetrical.uncertainty(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.00000000
## Amplicon_GC_content        0.00000000
## END_3                      0.02187407
## Start_Dangling_End         0.04429447
## Stop_Dangling_End          0.02598137
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
oneR(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                2.8965548
## Amplicon_GC_content         2.8965548
## END_3                       0.7936508
## Start_Dangling_End          0.9049111
## Stop_Dangling_End           0.8319027
## Fwd_Primer_Len              2.8965548
## Fwd_Primer_GC_content       2.8965548
## Rev_Primer_Len              2.8965548
## Rev_Primer_GC_content       2.8965548
random.forest.importance(formula, data = prepared_data, importance.type = 1)
##                       attr_importance
## Amplicon_Len                4.9995589
## Amplicon_GC_content         2.8925120
## END_3                      -3.6284379
## Start_Dangling_End          1.3762546
## Stop_Dangling_End          -6.7916164
## Fwd_Primer_Len             -2.6647553
## Fwd_Primer_GC_content      -0.7990219
## Rev_Primer_Len              9.2348298
## Rev_Primer_GC_content      13.9168295
library(FSelector)
histogram(prepared_data$mean)

sd(prepared_data$mean)
## [1] 0.003951656
formula <- mean ~ Amplicon_Len + Amplicon_GC_content + END_3+Start_Dangling_End+Stop_Dangling_End + Fwd_Primer_Len +Fwd_Primer_GC_content+ Rev_Primer_Len +Rev_Primer_GC_content
chi.squared(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                0.5359976
## Amplicon_GC_content         0.5601954
## END_3                       0.2088283
## Start_Dangling_End          0.1638039
## Stop_Dangling_End           0.2313565
## Fwd_Primer_Len              0.0000000
## Fwd_Primer_GC_content       0.0000000
## Rev_Primer_Len              0.0000000
## Rev_Primer_GC_content       0.0000000
chi <- chi.squared(formula, data = prepared_data)
chi$names <- rownames(chi)
chi <- chi[order(-chi$attr_importance), ]
barchart( chi$names ~ chi$attr_importance)

information.gain(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.23081264
## Amplicon_GC_content        0.20283463
## END_3                      0.09499156
## Start_Dangling_End         0.06088759
## Stop_Dangling_End          0.12678523
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
gain.ratio(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.23510367
## Amplicon_GC_content        0.42347307
## END_3                      0.04840583
## Start_Dangling_End         0.03330618
## Stop_Dangling_End          0.07142041
## Fwd_Primer_Len                    NaN
## Fwd_Primer_GC_content             NaN
## Rev_Primer_Len                    NaN
## Rev_Primer_GC_content             NaN
symmetrical.uncertainty(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len               0.13973841
## Amplicon_GC_content        0.14484429
## END_3                      0.04434561
## Start_Dangling_End         0.02934438
## Stop_Dangling_End          0.06189259
## Fwd_Primer_Len             0.00000000
## Fwd_Primer_GC_content      0.00000000
## Rev_Primer_Len             0.00000000
## Rev_Primer_GC_content      0.00000000
oneR(formula, data = prepared_data)
##                       attr_importance
## Amplicon_Len                2.4144563
## Amplicon_GC_content         2.7553665
## END_3                       0.8749454
## Start_Dangling_End          0.7936508
## Stop_Dangling_End           1.0225918
## Fwd_Primer_Len              2.8854733
## Fwd_Primer_GC_content       2.8854733
## Rev_Primer_Len              2.8854733
## Rev_Primer_GC_content       2.8854733
random.forest.importance(formula, data = prepared_data, importance.type = 1)
##                       attr_importance
## Amplicon_Len               15.4388714
## Amplicon_GC_content        30.9788026
## END_3                      -2.3855893
## Start_Dangling_End         -1.9192058
## Stop_Dangling_End          -4.3669253
## Fwd_Primer_Len              0.4986978
## Fwd_Primer_GC_content       1.4462451
## Rev_Primer_Len              3.6241943
## Rev_Primer_GC_content       0.5760765

Feature subset selection tests

library(corrplot)
prepared_data_num <- prepared_data
prepared_data_num$Start_Dangling_End <- as.numeric(prepared_data_num$Start_Dangling_End)
prepared_data_num$Stop_Dangling_End <- as.numeric(prepared_data_num$Stop_Dangling_End)
prepared_data_num$END_3 <- as.numeric(prepared_data_num$END_3)
prepared_data.scale<- scale(prepared_data_num[2:ncol(prepared_data_num)], center=TRUE, scale=TRUE)
cor_prepared_data <- cor(prepared_data.scale)
corrplot(cor_prepared_data, order = "hclust")

library(cvTools)
## Loading required package: robustbase
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:httr':
## 
##     progress
# control <- trainControl(method="repeatedcv", number=10, repeats=3)
# model <- train(res~., data=prepared_data, method="lvq", preProcess="scale", trControl=control)
# # estimate variable importance
# importance <- varImp(model, scale=FALSE)
# # summarize importance
# print(importance)
# # plot importance
# plot(importance)
evaluator <- function(subset) {
    formula <- res ~ . 
    valid_names <- c(subset,"res")
    rf <- randomForest(formula, prepared_data[,valid_names])
    result <- cvFit(rf, data =  prepared_data[,valid_names], y =  prepared_data[,valid_names]$res, cost = rtmspe,
        K = 5, R = 10, costArgs = list(trim = 0.1), seed = 1234)
    return(1-result$cv)
}

#perform the best subset search
#subset <- exhaustive.search(names(prepared_data[-c(1, 11)]), evaluator)
#subset <- best.first.search(names(prepared_data[-c(1, 11)]), evaluator)